# Import essential libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import zscore
from sklearn.cluster import KMeans
from matplotlib import cm
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn import metrics
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
%matplotlib inline
carname = pd.read_csv('Car_name.csv')
carname.head()
| car_name | |
|---|---|
| 0 | chevrolet chevelle malibu |
| 1 | buick skylark 320 |
| 2 | plymouth satellite |
| 3 | amc rebel sst |
| 4 | ford torino |
carattributes = pd.read_json('Car-Attributes.json')
carattributes.head()
| mpg | cyl | disp | hp | wt | acc | yr | origin | |
|---|---|---|---|---|---|---|---|---|
| 0 | 18.0 | 8 | 307.0 | 130 | 3504 | 12.0 | 70 | 1 |
| 1 | 15.0 | 8 | 350.0 | 165 | 3693 | 11.5 | 70 | 1 |
| 2 | 18.0 | 8 | 318.0 | 150 | 3436 | 11.0 | 70 | 1 |
| 3 | 16.0 | 8 | 304.0 | 150 | 3433 | 12.0 | 70 | 1 |
| 4 | 17.0 | 8 | 302.0 | 140 | 3449 | 10.5 | 70 | 1 |
# To confirm the shape of the data we check the row size of both dataframes
carname.shape, carattributes.shape
((398, 1), (398, 8))
# The rows in both the dataframes are similar, now concat the two dataframes
car_data = pd.concat([carname,carattributes], axis=1).reindex(carname.index)
car_data.sample(50)
| car_name | mpg | cyl | disp | hp | wt | acc | yr | origin | |
|---|---|---|---|---|---|---|---|---|---|
| 302 | plymouth horizon tc3 | 34.5 | 4 | 105.0 | 70 | 2150 | 14.9 | 79 | 1 |
| 31 | toyota corona | 25.0 | 4 | 113.0 | 95 | 2228 | 14.0 | 71 | 3 |
| 96 | amc ambassador brougham | 13.0 | 8 | 360.0 | 175 | 3821 | 11.0 | 73 | 1 |
| 0 | chevrolet chevelle malibu | 18.0 | 8 | 307.0 | 130 | 3504 | 12.0 | 70 | 1 |
| 193 | ford maverick | 24.0 | 6 | 200.0 | 81 | 3012 | 17.6 | 76 | 1 |
| 386 | buick century limited | 25.0 | 6 | 181.0 | 110 | 2945 | 16.4 | 82 | 1 |
| 337 | honda accord | 32.4 | 4 | 107.0 | 72 | 2290 | 17.0 | 80 | 3 |
| 167 | toyota corolla | 29.0 | 4 | 97.0 | 75 | 2171 | 16.0 | 75 | 3 |
| 128 | chevrolet nova | 15.0 | 6 | 250.0 | 100 | 3336 | 17.0 | 74 | 1 |
| 4 | ford torino | 17.0 | 8 | 302.0 | 140 | 3449 | 10.5 | 70 | 1 |
| 286 | ford ltd landau | 17.6 | 8 | 302.0 | 129 | 3725 | 13.4 | 79 | 1 |
| 342 | plymouth reliant | 30.0 | 4 | 135.0 | 84 | 2385 | 12.9 | 81 | 1 |
| 57 | toyota corona hardtop | 24.0 | 4 | 113.0 | 95 | 2278 | 15.5 | 72 | 3 |
| 376 | mazda glc custom l | 37.0 | 4 | 91.0 | 68 | 2025 | 18.2 | 82 | 3 |
| 116 | pontiac grand prix | 16.0 | 8 | 400.0 | 230 | 4278 | 9.5 | 73 | 1 |
| 228 | ford granada | 18.5 | 6 | 250.0 | 98 | 3525 | 19.0 | 77 | 1 |
| 330 | renault lecar deluxe | 40.9 | 4 | 85.0 | ? | 1835 | 17.3 | 80 | 2 |
| 303 | datsun 210 | 31.8 | 4 | 85.0 | 65 | 2020 | 19.2 | 79 | 3 |
| 120 | volvo 144ea | 19.0 | 4 | 121.0 | 112 | 2868 | 15.5 | 73 | 2 |
| 335 | triumph tr7 coupe | 35.0 | 4 | 122.0 | 88 | 2500 | 15.1 | 80 | 2 |
| 379 | mercury lynx l | 36.0 | 4 | 98.0 | 70 | 2125 | 17.3 | 82 | 1 |
| 301 | plymouth horizon | 34.2 | 4 | 105.0 | 70 | 2200 | 13.2 | 79 | 1 |
| 123 | toyota mark ii | 20.0 | 6 | 156.0 | 122 | 2807 | 13.5 | 73 | 3 |
| 44 | pontiac safari (sw) | 13.0 | 8 | 400.0 | 175 | 5140 | 12.0 | 71 | 1 |
| 156 | pontiac catalina | 16.0 | 8 | 400.0 | 170 | 4668 | 11.5 | 75 | 1 |
| 349 | mazda glc 4 | 34.1 | 4 | 91.0 | 68 | 1985 | 16.0 | 81 | 3 |
| 54 | datsun 1200 | 35.0 | 4 | 72.0 | 69 | 1613 | 18.0 | 71 | 3 |
| 109 | chevrolet vega | 21.0 | 4 | 140.0 | 72 | 2401 | 19.5 | 73 | 1 |
| 176 | amc pacer | 19.0 | 6 | 232.0 | 90 | 3211 | 17.0 | 75 | 1 |
| 315 | amc concord | 24.3 | 4 | 151.0 | 90 | 3003 | 20.1 | 80 | 1 |
| 114 | fiat 124 sport coupe | 26.0 | 4 | 98.0 | 90 | 2265 | 15.5 | 73 | 2 |
| 277 | peugeot 604sl | 16.2 | 6 | 163.0 | 133 | 3410 | 15.8 | 78 | 2 |
| 327 | audi 5000s (diesel) | 36.4 | 5 | 121.0 | 67 | 2950 | 19.9 | 80 | 2 |
| 284 | dodge aspen 6 | 20.6 | 6 | 225.0 | 110 | 3360 | 16.6 | 79 | 1 |
| 133 | chevrolet chevelle malibu classic | 16.0 | 6 | 250.0 | 100 | 3781 | 17.0 | 74 | 1 |
| 162 | amc matador | 15.0 | 6 | 258.0 | 110 | 3730 | 19.0 | 75 | 1 |
| 37 | amc matador | 18.0 | 6 | 232.0 | 100 | 3288 | 15.5 | 71 | 1 |
| 7 | plymouth fury iii | 14.0 | 8 | 440.0 | 215 | 4312 | 8.5 | 70 | 1 |
| 148 | fiat 124 tc | 26.0 | 4 | 116.0 | 75 | 2246 | 14.0 | 74 | 2 |
| 219 | plymouth arrow gs | 25.5 | 4 | 122.0 | 96 | 2300 | 15.5 | 77 | 1 |
| 86 | amc matador | 14.0 | 8 | 304.0 | 150 | 3672 | 11.5 | 73 | 1 |
| 38 | chevrolet impala | 14.0 | 8 | 350.0 | 165 | 4209 | 12.0 | 71 | 1 |
| 169 | amc gremlin | 20.0 | 6 | 232.0 | 100 | 2914 | 16.0 | 75 | 1 |
| 361 | toyota cressida | 25.4 | 6 | 168.0 | 116 | 2900 | 12.6 | 81 | 3 |
| 179 | volvo 244dl | 22.0 | 4 | 121.0 | 98 | 2945 | 14.5 | 75 | 2 |
| 140 | amc matador (sw) | 14.0 | 8 | 304.0 | 150 | 4257 | 15.5 | 74 | 1 |
| 245 | ford fiesta | 36.1 | 4 | 98.0 | 66 | 1800 | 14.4 | 78 | 1 |
| 113 | mercury capri v6 | 21.0 | 6 | 155.0 | 107 | 2472 | 14.0 | 73 | 1 |
| 380 | nissan stanza xe | 36.0 | 4 | 120.0 | 88 | 2160 | 14.5 | 82 | 3 |
| 14 | toyota corona mark ii | 24.0 | 4 | 113.0 | 95 | 2372 | 15.0 | 70 | 3 |
car_data.shape
(398, 9)
# Check the info of the dataframe
car_data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 398 entries, 0 to 397 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 car_name 398 non-null object 1 mpg 398 non-null float64 2 cyl 398 non-null int64 3 disp 398 non-null float64 4 hp 398 non-null object 5 wt 398 non-null int64 6 acc 398 non-null float64 7 yr 398 non-null int64 8 origin 398 non-null int64 dtypes: float64(3), int64(4), object(2) memory usage: 28.1+ KB
car_data.describe()
| mpg | cyl | disp | wt | acc | yr | origin | |
|---|---|---|---|---|---|---|---|
| count | 398.000000 | 398.000000 | 398.000000 | 398.000000 | 398.000000 | 398.000000 | 398.000000 |
| mean | 23.514573 | 5.454774 | 193.425879 | 2970.424623 | 15.568090 | 76.010050 | 1.572864 |
| std | 7.815984 | 1.701004 | 104.269838 | 846.841774 | 2.757689 | 3.697627 | 0.802055 |
| min | 9.000000 | 3.000000 | 68.000000 | 1613.000000 | 8.000000 | 70.000000 | 1.000000 |
| 25% | 17.500000 | 4.000000 | 104.250000 | 2223.750000 | 13.825000 | 73.000000 | 1.000000 |
| 50% | 23.000000 | 4.000000 | 148.500000 | 2803.500000 | 15.500000 | 76.000000 | 1.000000 |
| 75% | 29.000000 | 8.000000 | 262.000000 | 3608.000000 | 17.175000 | 79.000000 | 2.000000 |
| max | 46.600000 | 8.000000 | 455.000000 | 5140.000000 | 24.800000 | 82.000000 | 3.000000 |
# Check for missing values
# Replace empty places with NaN
car_data_new = car_data.replace(' ',np.nan)
car_data_new.isnull().sum()*100/len(car_data_new)
car_name 0.0 mpg 0.0 cyl 0.0 disp 0.0 hp 0.0 wt 0.0 acc 0.0 yr 0.0 origin 0.0 dtype: float64
# Check for any missing value in all columns
car_data_new.isnull().values.any()
False
There is no missing values in the dataset therefore no imputation is required for the dataset.
car_data[car_data.duplicated()]
| car_name | mpg | cyl | disp | hp | wt | acc | yr | origin |
|---|
# Count of duplicate rows in the dataframe
car_data.duplicated().sum()
0
There is no duplicate rows in the dataset, therefore no imputation is required.
# Plot the pairplot
sns.pairplot(data=car_data, diag_kind = 'kde', corner = True);
C:\Users\fenuj\anaconda3\Lib\site-packages\seaborn\axisgrid.py:118: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
sns.scatterplot(data=car_data, x = 'wt', y = 'disp', hue = 'cyl');
sns.scatterplot(data=car_data, x = 'wt', y = 'mpg', hue = 'cyl');
car_data[car_data['hp']=="?"]
| car_name | mpg | cyl | disp | hp | wt | acc | yr | origin | |
|---|---|---|---|---|---|---|---|---|---|
| 32 | ford pinto | 25.0 | 4 | 98.0 | ? | 2046 | 19.0 | 71 | 1 |
| 126 | ford maverick | 21.0 | 6 | 200.0 | ? | 2875 | 17.0 | 74 | 1 |
| 330 | renault lecar deluxe | 40.9 | 4 | 85.0 | ? | 1835 | 17.3 | 80 | 2 |
| 336 | ford mustang cobra | 23.6 | 4 | 140.0 | ? | 2905 | 14.3 | 80 | 1 |
| 354 | renault 18i | 34.5 | 4 | 100.0 | ? | 2320 | 15.8 | 81 | 2 |
| 374 | amc concord dl | 23.0 | 4 | 151.0 | ? | 3035 | 20.5 | 82 | 1 |
# Check for unexpected values in all columns
# Replace ? places with NaN
car_data_new = car_data.replace('?',np.nan)
car_data_new.isnull().sum()*100/len(car_data_new)
car_name 0.000000 mpg 0.000000 cyl 0.000000 disp 0.000000 hp 1.507538 wt 0.000000 acc 0.000000 yr 0.000000 origin 0.000000 dtype: float64
1.5% values in hp column are nan as they had ? entries in some rows. Imputing with mean values can lead to bias due to the presence of outliers. The ideal strategy is to drop them or impute them with the median values.
# Impute with Median values
car_data_new['hp'].fillna((car_data_new['hp'].median()), inplace=True)
# Change datatype of hp to float
car_data_new['hp'] = car_data_new['hp'].astype('float')
car_data_new.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 398 entries, 0 to 397 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 car_name 398 non-null object 1 mpg 398 non-null float64 2 cyl 398 non-null int64 3 disp 398 non-null float64 4 hp 398 non-null float64 5 wt 398 non-null int64 6 acc 398 non-null float64 7 yr 398 non-null int64 8 origin 398 non-null int64 dtypes: float64(4), int64(4), object(1) memory usage: 28.1+ KB
# Plot the pairplot
sns.pairplot(data=car_data_new, diag_kind = 'kde', corner = True, hue = 'cyl');
C:\Users\fenuj\anaconda3\Lib\site-packages\seaborn\axisgrid.py:118: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
# Calculate age of vehicle
# This dataset was used in 1983 so we would subtract year from 83 to get the age
car_data_new['age'] = 83-car_data_new['yr']
car_data_new.head()
| car_name | mpg | cyl | disp | hp | wt | acc | yr | origin | age | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | chevrolet chevelle malibu | 18.0 | 8 | 307.0 | 130.0 | 3504 | 12.0 | 70 | 1 | 13 |
| 1 | buick skylark 320 | 15.0 | 8 | 350.0 | 165.0 | 3693 | 11.5 | 70 | 1 | 13 |
| 2 | plymouth satellite | 18.0 | 8 | 318.0 | 150.0 | 3436 | 11.0 | 70 | 1 | 13 |
| 3 | amc rebel sst | 16.0 | 8 | 304.0 | 150.0 | 3433 | 12.0 | 70 | 1 | 13 |
| 4 | ford torino | 17.0 | 8 | 302.0 | 140.0 | 3449 | 10.5 | 70 | 1 | 13 |
# We create dummy variables from the origin column
one_hot = pd.get_dummies(car_data_new['origin'])
one_hot = one_hot.add_prefix('origin_')
car_data_new = car_data_new.join(one_hot)
car_data_new.head()
| car_name | mpg | cyl | disp | hp | wt | acc | yr | origin | age | origin_1 | origin_2 | origin_3 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | chevrolet chevelle malibu | 18.0 | 8 | 307.0 | 130.0 | 3504 | 12.0 | 70 | 1 | 13 | True | False | False |
| 1 | buick skylark 320 | 15.0 | 8 | 350.0 | 165.0 | 3693 | 11.5 | 70 | 1 | 13 | True | False | False |
| 2 | plymouth satellite | 18.0 | 8 | 318.0 | 150.0 | 3436 | 11.0 | 70 | 1 | 13 | True | False | False |
| 3 | amc rebel sst | 16.0 | 8 | 304.0 | 150.0 | 3433 | 12.0 | 70 | 1 | 13 | True | False | False |
| 4 | ford torino | 17.0 | 8 | 302.0 | 140.0 | 3449 | 10.5 | 70 | 1 | 13 | True | False | False |
cars_new = car_data_new.drop(['yr','origin','car_name'], axis =1)
cars_new.head()
| mpg | cyl | disp | hp | wt | acc | age | origin_1 | origin_2 | origin_3 | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 18.0 | 8 | 307.0 | 130.0 | 3504 | 12.0 | 13 | True | False | False |
| 1 | 15.0 | 8 | 350.0 | 165.0 | 3693 | 11.5 | 13 | True | False | False |
| 2 | 18.0 | 8 | 318.0 | 150.0 | 3436 | 11.0 | 13 | True | False | False |
| 3 | 16.0 | 8 | 304.0 | 150.0 | 3433 | 12.0 | 13 | True | False | False |
| 4 | 17.0 | 8 | 302.0 | 140.0 | 3449 | 10.5 | 13 | True | False | False |
cars_new.dtypes
# Choose numeric columns from the dataframe
numeric_cols = cars_new.select_dtypes(include=[np.int64, np.float64]).columns
numeric_cols
# Apply zscore to numeric columns
cars_new[numeric_cols] =cars_new[numeric_cols].apply(zscore)
cars_new.head()
| mpg | cyl | disp | hp | wt | acc | age | origin_1 | origin_2 | origin_3 | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -0.706439 | 1.498191 | 1.090604 | 0.673118 | 0.630870 | -1.295498 | 1.627426 | True | False | False |
| 1 | -1.090751 | 1.498191 | 1.503514 | 1.589958 | 0.854333 | -1.477038 | 1.627426 | True | False | False |
| 2 | -0.706439 | 1.498191 | 1.196232 | 1.197027 | 0.550470 | -1.658577 | 1.627426 | True | False | False |
| 3 | -0.962647 | 1.498191 | 1.061796 | 1.197027 | 0.546923 | -1.295498 | 1.627426 | True | False | False |
| 4 | -0.834543 | 1.498191 | 1.042591 | 0.935072 | 0.565841 | -1.840117 | 1.627426 | True | False | False |
cluster_range = range(2,11)
cluster_errors = []
#Create Cluster
for num_clusters in cluster_range:
clusters = KMeans(num_clusters, n_init = 5)
clusters.fit(cars_new)
labels = clusters.labels_
centroids = clusters.cluster_centers_
cluster_errors.append(clusters.inertia_)
clusters_df = pd.DataFrame({"num_clusters": cluster_range, "cluster_errors": cluster_errors})
clusters_df[0:10]
C:\Users\fenuj\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1436: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=2. warnings.warn( C:\Users\fenuj\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1436: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=2. warnings.warn( C:\Users\fenuj\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1436: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=2. warnings.warn( C:\Users\fenuj\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1436: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=2. warnings.warn( C:\Users\fenuj\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1436: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=2. warnings.warn( C:\Users\fenuj\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1436: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=2. warnings.warn( C:\Users\fenuj\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1436: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=2. warnings.warn( C:\Users\fenuj\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1436: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=2. warnings.warn( C:\Users\fenuj\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1436: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=2. warnings.warn(
| num_clusters | cluster_errors | |
|---|---|---|
| 0 | 2 | 1472.940229 |
| 1 | 3 | 1107.980067 |
| 2 | 4 | 892.654455 |
| 3 | 5 | 812.259776 |
| 4 | 6 | 737.234296 |
| 5 | 7 | 681.130615 |
| 6 | 8 | 632.483096 |
| 7 | 9 | 600.013629 |
| 8 | 10 | 564.577972 |
plt.figure(figsize=(12,6))
plt.plot(clusters_df.num_clusters, clusters_df.cluster_errors, marker = "o" );
The bend occurs at the 4th index. So 4 clusters are needed.
plt.figure(figsize=(12,6))
plt.plot(clusters_df.num_clusters, clusters_df.cluster_errors, 'bo-')
# Highlight the possible elbow points in the same figure
plt.plot([4, 5],[892.654455, 807.377856],'r*', markersize=12, label='Possible Elbow Points')
plt.legend(loc = 'best')
plt.show()
kmeans_5 = KMeans(n_clusters=5, n_init = 5, random_state=12345)
kmeans_5.fit(cars_new)
C:\Users\fenuj\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1436: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=2. warnings.warn(
KMeans(n_clusters=5, n_init=5, random_state=12345)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
KMeans(n_clusters=5, n_init=5, random_state=12345)
# Create labels
labels = kmeans_5.labels_
# Calculating silhouette_score
silhouette_score(cars_new,labels)
0.3006563959678084
# Create a model with 4 clusters
kmeans = KMeans(n_clusters=4, n_init = 5, random_state=12345)
kmeans.fit(cars_new)
C:\Users\fenuj\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1436: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=2. warnings.warn(
KMeans(n_clusters=4, n_init=5, random_state=12345)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
KMeans(n_clusters=4, n_init=5, random_state=12345)
# Create labels
labels = kmeans.labels_
# Calculating silhouette_score
silhouette_score(cars_new,labels)
0.3219323876445457
# Create a model with 3 clusters
kmeans_3 = KMeans(n_clusters=4, n_init = 5, random_state=12345)
kmeans_3.fit(cars_new)
C:\Users\fenuj\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1436: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=2. warnings.warn(
KMeans(n_clusters=4, n_init=5, random_state=12345)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
KMeans(n_clusters=4, n_init=5, random_state=12345)
labels = kmeans_3.labels_
# Calculating silhouette_score
silhouette_score(cars_new,labels)
0.3219323876445457
Based on the silhouette score the optimal number of clusters should be 4. There is a sharp drop at 4 in the silhouette score.
labels = kmeans.labels_
counts = np.bincount(labels[labels>=0])
print(counts)
[114 98 94 92]
Distribution looks fine.
# let us check the centers in each group
centroids = kmeans.cluster_centers_
centroid_df = pd.DataFrame(centroids, columns = list(cars_new) )
centroid_df.transpose()
| 0 | 1 | 2 | 3 | |
|---|---|---|---|---|
| mpg | 1.172644 | 0.213034 | -1.163797e+00 | -0.490890 |
| cyl | -0.825340 | -0.868333 | 1.498191e+00 | 0.416907 |
| disp | -0.778107 | -0.823983 | 1.503923e+00 | 0.305280 |
| hp | -0.744883 | -0.553391 | 1.521683e+00 | -0.042274 |
| wt | -0.760687 | -0.769895 | 1.404098e+00 | 0.328074 |
| acc | 0.410654 | 0.286859 | -1.086149e+00 | 0.295340 |
| age | -1.101793 | 0.635472 | 6.883238e-01 | -0.014938 |
| origin_1 | 0.421053 | 0.244898 | 1.000000e+00 | 0.902174 |
| origin_2 | 0.184211 | 0.459184 | -2.498002e-16 | 0.043478 |
| origin_3 | 0.394737 | 0.295918 | 8.326673e-17 | 0.054348 |
# Add cluster number to original cars data
predictions = kmeans.predict(cars_new)
predictions
car_data["group"] = predictions
car_data['group'] = car_data['group'].astype('category')
car_data.dtypes
car_name object mpg float64 cyl int64 disp float64 hp object wt int64 acc float64 yr int64 origin int64 group category dtype: object
car_data.head()
| car_name | mpg | cyl | disp | hp | wt | acc | yr | origin | group | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | chevrolet chevelle malibu | 18.0 | 8 | 307.0 | 130 | 3504 | 12.0 | 70 | 1 | 2 |
| 1 | buick skylark 320 | 15.0 | 8 | 350.0 | 165 | 3693 | 11.5 | 70 | 1 | 2 |
| 2 | plymouth satellite | 18.0 | 8 | 318.0 | 150 | 3436 | 11.0 | 70 | 1 | 2 |
| 3 | amc rebel sst | 16.0 | 8 | 304.0 | 150 | 3433 | 12.0 | 70 | 1 | 2 |
| 4 | ford torino | 17.0 | 8 | 302.0 | 140 | 3449 | 10.5 | 70 | 1 | 2 |
car_data['group'].value_counts().sort_index()
group 0 114 1 98 2 94 3 92 Name: count, dtype: int64
# Group the clusters
df_analysis = (car_data.groupby(['group'],axis=0)).head()
df_analysis
| car_name | mpg | cyl | disp | hp | wt | acc | yr | origin | group | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | chevrolet chevelle malibu | 18.0 | 8 | 307.0 | 130 | 3504 | 12.0 | 70 | 1 | 2 |
| 1 | buick skylark 320 | 15.0 | 8 | 350.0 | 165 | 3693 | 11.5 | 70 | 1 | 2 |
| 2 | plymouth satellite | 18.0 | 8 | 318.0 | 150 | 3436 | 11.0 | 70 | 1 | 2 |
| 3 | amc rebel sst | 16.0 | 8 | 304.0 | 150 | 3433 | 12.0 | 70 | 1 | 2 |
| 4 | ford torino | 17.0 | 8 | 302.0 | 140 | 3449 | 10.5 | 70 | 1 | 2 |
| 14 | toyota corona mark ii | 24.0 | 4 | 113.0 | 95 | 2372 | 15.0 | 70 | 3 | 1 |
| 15 | plymouth duster | 22.0 | 6 | 198.0 | 95 | 2833 | 15.5 | 70 | 1 | 3 |
| 16 | amc hornet | 18.0 | 6 | 199.0 | 97 | 2774 | 15.5 | 70 | 1 | 3 |
| 17 | ford maverick | 21.0 | 6 | 200.0 | 85 | 2587 | 16.0 | 70 | 1 | 3 |
| 18 | datsun pl510 | 27.0 | 4 | 97.0 | 88 | 2130 | 14.5 | 70 | 3 | 1 |
| 19 | volkswagen 1131 deluxe sedan | 26.0 | 4 | 97.0 | 46 | 1835 | 20.5 | 70 | 2 | 1 |
| 20 | peugeot 504 | 25.0 | 4 | 110.0 | 87 | 2672 | 17.5 | 70 | 2 | 1 |
| 21 | audi 100 ls | 24.0 | 4 | 107.0 | 90 | 2430 | 14.5 | 70 | 2 | 1 |
| 24 | amc gremlin | 21.0 | 6 | 199.0 | 90 | 2648 | 15.0 | 70 | 1 | 3 |
| 33 | amc gremlin | 19.0 | 6 | 232.0 | 100 | 2634 | 13.0 | 71 | 1 | 3 |
| 195 | chevrolet chevette | 29.0 | 4 | 85.0 | 52 | 2035 | 22.2 | 76 | 1 | 0 |
| 198 | honda civic | 33.0 | 4 | 91.0 | 53 | 1795 | 17.4 | 76 | 3 | 0 |
| 204 | datsun b-210 | 32.0 | 4 | 85.0 | 70 | 1990 | 17.0 | 76 | 3 | 0 |
| 216 | honda accord cvcc | 31.5 | 4 | 98.0 | 68 | 2045 | 18.5 | 77 | 3 | 0 |
| 217 | buick opel isuzu deluxe | 30.0 | 4 | 111.0 | 80 | 2155 | 14.8 | 77 | 1 | 0 |
# Plot pairplot with hue
sns.pairplot(data=car_data, hue='group', corner = True, diag_kind = 'kde');
C:\Users\fenuj\anaconda3\Lib\site-packages\seaborn\axisgrid.py:118: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
# Visualize the centers
cars_new["group"] = predictions
cars_new.boxplot(by = 'group', layout=(3,4), figsize=(15, 10), patch_artist=True);
# Create a new record dataframe
new_record = [[0.241531,-0.856321,0.370411,-1.448713,-0.888443,1.645441,0.544290,0,0,1]]
new_record = pd.DataFrame(new_record)
new_record
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.241531 | -0.856321 | 0.370411 | -1.448713 | -0.888443 | 1.645441 | 0.54429 | 0 | 0 | 1 |
kmeans.predict(new_record)
C:\Users\fenuj\anaconda3\Lib\site-packages\sklearn\base.py:464: UserWarning: X does not have valid feature names, but KMeans was fitted with feature names warnings.warn(
array([1])
The model predicted group for the new record as 1
vehicle = pd.read_csv('vehicle.csv')
vehicle.head()
| compactness | circularity | distance_circularity | radius_ratio | pr.axis_aspect_ratio | max.length_aspect_ratio | scatter_ratio | elongatedness | pr.axis_rectangularity | max.length_rectangularity | scaled_variance | scaled_variance.1 | scaled_radius_of_gyration | scaled_radius_of_gyration.1 | skewness_about | skewness_about.1 | skewness_about.2 | hollows_ratio | class | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 95 | 48.0 | 83.0 | 178.0 | 72.0 | 10 | 162.0 | 42.0 | 20.0 | 159 | 176.0 | 379.0 | 184.0 | 70.0 | 6.0 | 16.0 | 187.0 | 197 | van |
| 1 | 91 | 41.0 | 84.0 | 141.0 | 57.0 | 9 | 149.0 | 45.0 | 19.0 | 143 | 170.0 | 330.0 | 158.0 | 72.0 | 9.0 | 14.0 | 189.0 | 199 | van |
| 2 | 104 | 50.0 | 106.0 | 209.0 | 66.0 | 10 | 207.0 | 32.0 | 23.0 | 158 | 223.0 | 635.0 | 220.0 | 73.0 | 14.0 | 9.0 | 188.0 | 196 | car |
| 3 | 93 | 41.0 | 82.0 | 159.0 | 63.0 | 9 | 144.0 | 46.0 | 19.0 | 143 | 160.0 | 309.0 | 127.0 | 63.0 | 6.0 | 10.0 | 199.0 | 207 | van |
| 4 | 85 | 44.0 | 70.0 | 205.0 | 103.0 | 52 | 149.0 | 45.0 | 19.0 | 144 | 241.0 | 325.0 | 188.0 | 127.0 | 9.0 | 11.0 | 180.0 | 183 | bus |
vehicle.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 846 entries, 0 to 845 Data columns (total 19 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 compactness 846 non-null int64 1 circularity 841 non-null float64 2 distance_circularity 842 non-null float64 3 radius_ratio 840 non-null float64 4 pr.axis_aspect_ratio 844 non-null float64 5 max.length_aspect_ratio 846 non-null int64 6 scatter_ratio 845 non-null float64 7 elongatedness 845 non-null float64 8 pr.axis_rectangularity 843 non-null float64 9 max.length_rectangularity 846 non-null int64 10 scaled_variance 843 non-null float64 11 scaled_variance.1 844 non-null float64 12 scaled_radius_of_gyration 844 non-null float64 13 scaled_radius_of_gyration.1 842 non-null float64 14 skewness_about 840 non-null float64 15 skewness_about.1 845 non-null float64 16 skewness_about.2 845 non-null float64 17 hollows_ratio 846 non-null int64 18 class 846 non-null object dtypes: float64(14), int64(4), object(1) memory usage: 125.7+ KB
# Replace empty places with NaN
vehicle1 = vehicle.replace([' ','?'],np.nan)
# Calculate percentage of null values for each column
missing = vehicle1.isnull().sum()*100/len(vehicle1)
missing
compactness 0.000000 circularity 0.591017 distance_circularity 0.472813 radius_ratio 0.709220 pr.axis_aspect_ratio 0.236407 max.length_aspect_ratio 0.000000 scatter_ratio 0.118203 elongatedness 0.118203 pr.axis_rectangularity 0.354610 max.length_rectangularity 0.000000 scaled_variance 0.354610 scaled_variance.1 0.236407 scaled_radius_of_gyration 0.236407 scaled_radius_of_gyration.1 0.472813 skewness_about 0.709220 skewness_about.1 0.118203 skewness_about.2 0.118203 hollows_ratio 0.000000 class 0.000000 dtype: float64
# Get List of Columns with missing values
missing_val_col = vehicle.columns[vehicle.isnull().any()].tolist()
missing_val_col
['circularity', 'distance_circularity', 'radius_ratio', 'pr.axis_aspect_ratio', 'scatter_ratio', 'elongatedness', 'pr.axis_rectangularity', 'scaled_variance', 'scaled_variance.1', 'scaled_radius_of_gyration', 'scaled_radius_of_gyration.1', 'skewness_about', 'skewness_about.1', 'skewness_about.2']
for col in missing_val_col:
vehicle1[col] = vehicle1[col].fillna(vehicle1[col].median())
vehicle1.isnull().sum()*100/len(vehicle1)
compactness 0.0 circularity 0.0 distance_circularity 0.0 radius_ratio 0.0 pr.axis_aspect_ratio 0.0 max.length_aspect_ratio 0.0 scatter_ratio 0.0 elongatedness 0.0 pr.axis_rectangularity 0.0 max.length_rectangularity 0.0 scaled_variance 0.0 scaled_variance.1 0.0 scaled_radius_of_gyration 0.0 scaled_radius_of_gyration.1 0.0 skewness_about 0.0 skewness_about.1 0.0 skewness_about.2 0.0 hollows_ratio 0.0 class 0.0 dtype: float64
# Plot pie chart with percentage values
vehicle1.groupby('class').size().plot(kind='pie', autopct='%1.1f%%');
vehicle1[vehicle1.duplicated()]
| compactness | circularity | distance_circularity | radius_ratio | pr.axis_aspect_ratio | max.length_aspect_ratio | scatter_ratio | elongatedness | pr.axis_rectangularity | max.length_rectangularity | scaled_variance | scaled_variance.1 | scaled_radius_of_gyration | scaled_radius_of_gyration.1 | skewness_about | skewness_about.1 | skewness_about.2 | hollows_ratio | class |
|---|
vehicle1.duplicated().sum()
0
No duplicate rows are there and therefore no imputation is needed.
# Plot pairplot of vehicle 1
sns.pairplot(data=vehicle1, hue = 'class', diag_kind = 'kde', corner = True);
C:\Users\fenuj\anaconda3\Lib\site-packages\seaborn\axisgrid.py:118: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
vehicle1.head()
| compactness | circularity | distance_circularity | radius_ratio | pr.axis_aspect_ratio | max.length_aspect_ratio | scatter_ratio | elongatedness | pr.axis_rectangularity | max.length_rectangularity | scaled_variance | scaled_variance.1 | scaled_radius_of_gyration | scaled_radius_of_gyration.1 | skewness_about | skewness_about.1 | skewness_about.2 | hollows_ratio | class | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 95 | 48.0 | 83.0 | 178.0 | 72.0 | 10 | 162.0 | 42.0 | 20.0 | 159 | 176.0 | 379.0 | 184.0 | 70.0 | 6.0 | 16.0 | 187.0 | 197 | van |
| 1 | 91 | 41.0 | 84.0 | 141.0 | 57.0 | 9 | 149.0 | 45.0 | 19.0 | 143 | 170.0 | 330.0 | 158.0 | 72.0 | 9.0 | 14.0 | 189.0 | 199 | van |
| 2 | 104 | 50.0 | 106.0 | 209.0 | 66.0 | 10 | 207.0 | 32.0 | 23.0 | 158 | 223.0 | 635.0 | 220.0 | 73.0 | 14.0 | 9.0 | 188.0 | 196 | car |
| 3 | 93 | 41.0 | 82.0 | 159.0 | 63.0 | 9 | 144.0 | 46.0 | 19.0 | 143 | 160.0 | 309.0 | 127.0 | 63.0 | 6.0 | 10.0 | 199.0 | 207 | van |
| 4 | 85 | 44.0 | 70.0 | 205.0 | 103.0 | 52 | 149.0 | 45.0 | 19.0 | 144 | 241.0 | 325.0 | 188.0 | 127.0 | 9.0 | 11.0 | 180.0 | 183 | bus |
# The target variable is class
# All others are independent variables
# X is the dataframe for independent variables
X = vehicle1.drop(['class'], axis = 1)
X.head()
| compactness | circularity | distance_circularity | radius_ratio | pr.axis_aspect_ratio | max.length_aspect_ratio | scatter_ratio | elongatedness | pr.axis_rectangularity | max.length_rectangularity | scaled_variance | scaled_variance.1 | scaled_radius_of_gyration | scaled_radius_of_gyration.1 | skewness_about | skewness_about.1 | skewness_about.2 | hollows_ratio | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 95 | 48.0 | 83.0 | 178.0 | 72.0 | 10 | 162.0 | 42.0 | 20.0 | 159 | 176.0 | 379.0 | 184.0 | 70.0 | 6.0 | 16.0 | 187.0 | 197 |
| 1 | 91 | 41.0 | 84.0 | 141.0 | 57.0 | 9 | 149.0 | 45.0 | 19.0 | 143 | 170.0 | 330.0 | 158.0 | 72.0 | 9.0 | 14.0 | 189.0 | 199 |
| 2 | 104 | 50.0 | 106.0 | 209.0 | 66.0 | 10 | 207.0 | 32.0 | 23.0 | 158 | 223.0 | 635.0 | 220.0 | 73.0 | 14.0 | 9.0 | 188.0 | 196 |
| 3 | 93 | 41.0 | 82.0 | 159.0 | 63.0 | 9 | 144.0 | 46.0 | 19.0 | 143 | 160.0 | 309.0 | 127.0 | 63.0 | 6.0 | 10.0 | 199.0 | 207 |
| 4 | 85 | 44.0 | 70.0 | 205.0 | 103.0 | 52 | 149.0 | 45.0 | 19.0 | 144 | 241.0 | 325.0 | 188.0 | 127.0 | 9.0 | 11.0 | 180.0 | 183 |
# Y is the target variable
Y = vehicle1['class']
Y.head()
0 van 1 van 2 car 3 van 4 bus Name: class, dtype: object
# Split X and Y into training and test set in 80:20 ratio
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.20, random_state=1)
# Print shape of the data split
X_train.shape, X_test.shape, y_train.shape, y_test.shape
((676, 18), (170, 18), (676,), (170,))
scaler = StandardScaler()
# Perform standardization
normalized_x_train = pd.DataFrame(scaler.fit_transform(X_train), columns = X_train.columns)
# Display header of normalized X_train
normalized_x_train.head()
| compactness | circularity | distance_circularity | radius_ratio | pr.axis_aspect_ratio | max.length_aspect_ratio | scatter_ratio | elongatedness | pr.axis_rectangularity | max.length_rectangularity | scaled_variance | scaled_variance.1 | scaled_radius_of_gyration | scaled_radius_of_gyration.1 | skewness_about | skewness_about.1 | skewness_about.2 | hollows_ratio | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -0.581861 | 0.342899 | 0.042356 | 4.503746 | 8.714319 | 8.151668 | -0.342929 | 0.278004 | -0.237311 | 1.033734 | 1.248347 | -0.439164 | 0.012915 | 3.160152 | -1.295902 | 0.160019 | -0.797977 | -0.216177 |
| 1 | -1.550674 | -0.148202 | -0.649855 | -0.894152 | -0.214036 | -0.541322 | -0.492452 | 0.405501 | -0.621097 | -0.150562 | -0.290095 | -0.534559 | 0.074077 | 1.090757 | -1.088803 | 0.270625 | -1.125302 | -1.280920 |
| 2 | 0.023647 | 0.342899 | 0.168212 | 4.828210 | 9.325850 | 8.358643 | -0.432643 | 0.278004 | -0.621097 | 0.476418 | 4.105453 | -0.495278 | 0.349302 | 8.074967 | 1.189289 | -0.393012 | -0.143326 | 0.050008 |
| 3 | 0.992460 | 1.488802 | 1.112135 | -0.186231 | -1.070179 | 0.286582 | 1.301821 | -1.251960 | 1.297833 | 1.730378 | 0.934379 | 1.272334 | 0.777430 | 0.444071 | 1.189289 | 1.597899 | -0.306989 | -0.083085 |
| 4 | 0.386952 | 1.652502 | 0.860422 | 0.020246 | -0.947873 | 0.286582 | 1.391534 | -1.251960 | 1.297833 | 1.730378 | 0.934379 | 1.362117 | 1.297301 | 0.314733 | -1.295902 | -0.946043 | -0.797977 | -0.349270 |
normalized_x_test = pd.DataFrame(scaler.transform(X_test), columns = X_test.columns)
# Display header of normalized X_test
normalized_x_test.head()
| compactness | circularity | distance_circularity | radius_ratio | pr.axis_aspect_ratio | max.length_aspect_ratio | scatter_ratio | elongatedness | pr.axis_rectangularity | max.length_rectangularity | scaled_variance | scaled_variance.1 | scaled_radius_of_gyration | scaled_radius_of_gyration.1 | skewness_about | skewness_about.1 | skewness_about.2 | hollows_ratio | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -0.460760 | 0.179199 | -0.461070 | -1.071132 | -0.825567 | 0.493558 | -0.283120 | 0.278004 | -0.237311 | 0.894405 | -0.509872 | -0.411106 | -0.139988 | 0.573408 | -1.295902 | 0.381231 | -1.125302 | -0.482363 |
| 1 | 1.840172 | 1.161401 | 0.797494 | 0.580684 | -0.458648 | -0.955274 | 1.720484 | -1.506954 | 1.681619 | 0.685412 | 2.284440 | 1.872761 | 1.266720 | 1.220094 | -1.295902 | 0.713050 | -0.306989 | -1.280920 |
| 2 | 0.750257 | -0.639303 | -0.461070 | 1.052631 | 1.131333 | -0.748298 | 0.195353 | -0.614475 | 0.146475 | -0.707877 | 0.463428 | 0.206155 | -0.751600 | -0.073278 | 0.153793 | 0.713050 | 1.329638 | 0.183101 |
| 3 | 0.750257 | 0.670300 | -0.146429 | 1.082128 | 1.009026 | -0.541322 | 0.404684 | -0.741972 | 0.146475 | 0.546083 | 0.526221 | 0.419391 | 0.685689 | 0.056059 | -0.674604 | 0.049413 | 1.493301 | 0.449287 |
| 4 | 0.144749 | -0.475603 | 0.860422 | 0.816657 | 0.397495 | 0.079606 | 0.255162 | -0.486978 | 0.146475 | -0.498884 | 0.306444 | 0.178098 | -0.812761 | -0.719964 | -1.088803 | 1.819112 | 0.674987 | 0.582379 |
# Building a Support Vector Machine on train data
svc1 = SVC(random_state = 111)
# Fit the training data
svc1.fit(normalized_x_train, y_train)
# Make predictions on the test data
svc_y_pred = svc1.predict(normalized_x_test)
# check the accuracy on the training & test set
print("Training accuracy is: ",svc1.score(normalized_x_train, y_train))
print("Test accuracy is: ",svc1.score(normalized_x_test, y_test))
Training accuracy is: 0.9807692307692307 Test accuracy is: 0.9588235294117647
svc_y_pred_train = svc1.predict(normalized_x_train)
# Classification Report
# Create list of classes in the target variable class
target_names = ['car','bus','van']
# Print Classification Report for training data
print(classification_report(y_train, svc_y_pred_train, target_names=target_names))
precision recall f1-score support
car 0.99 0.98 0.98 181
bus 0.99 0.99 0.99 339
van 0.96 0.97 0.96 156
accuracy 0.98 676
macro avg 0.98 0.98 0.98 676
weighted avg 0.98 0.98 0.98 676
print("Confusion Matrix for Training Data")
# Calculate & Create the confusion matrix
cm=metrics.confusion_matrix(y_train, svc_y_pred_train, labels=['car','bus','van'])
df_cm = pd.DataFrame(cm, index = [i for i in ['car','bus','van']],
columns = [i for i in ['car','bus','van']])
# Import plotting library
#import matplotlib.pyplot as plt
#%matplotlib inline
# Plot the confusion matrix
plt.figure(figsize = (7,5));
sns.heatmap(df_cm, annot=True, fmt = 'g');
Confusion Matrix for Training Data
# Create PCA model with 10 components
pca = PCA(n_components=10)
# Fit the training data
pca.fit(normalized_x_train)
# Create reduced dimension data with applied PCA
#data_reduced = pca.fit_transform(normalized_x_train)
#data_reduced.transpose()
PCA(n_components=10)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA(n_components=10)
cov_matrix = np.cov(normalized_x_train.T) # the relevanat covariance matrix
print('Covariance Matrix \n%s', cov_matrix)
#generating the eigen values and the eigen vectors
e_vals, e_vecs = np.linalg.eig(cov_matrix)
print('Eigenvectors \n%s' %e_vecs)
print('\nEigenvalues \n%s' %e_vals)
Covariance Matrix %s [[ 1.00148148 0.68288411 0.79149453 0.68221511 0.07748053 0.13316939 0.8106164 -0.78343778 0.81335796 0.67885892 0.75201699 0.81156943 0.58192296 -0.25159836 0.229138 0.15275846 0.29866555 0.36744677] [ 0.68288411 1.00148148 0.79503591 0.61942336 0.14679522 0.23267596 0.8469733 -0.82379955 0.84071063 0.96173383 0.79377977 0.83368356 0.92369871 0.039593 0.1429656 -0.00771439 -0.09878982 0.05325642] [ 0.79149453 0.79503591 1.00148148 0.76657766 0.14923539 0.24461215 0.90538129 -0.91396383 0.89409918 0.77803025 0.85775358 0.88577319 0.7081183 -0.23534514 0.11472585 0.27280808 0.1599696 0.34671282] [ 0.68221511 0.61942336 0.76657766 1.00148148 0.66226494 0.46031202 0.72605683 -0.7809903 0.70097597 0.56768292 0.79376668 0.70900597 0.5329057 -0.16148913 0.04224024 0.17440558 0.38355712 0.47782313] [ 0.07748053 0.14679522 0.14923539 0.66226494 1.00148148 0.6777896 0.08645814 -0.16380288 0.06359422 0.11683986 0.27861769 0.07229896 0.11449509 0.18856533 -0.05938367 -0.03736686 0.23347194 0.26615263] [ 0.13316939 0.23267596 0.24461215 0.46031202 0.6777896 1.00148148 0.14518814 -0.16192652 0.13824765 0.28198859 0.32373762 0.12309283 0.1694013 0.32646364 0.0037418 0.03499535 -0.01497029 0.14668507] [ 0.8106164 0.8469733 0.90538129 0.72605683 0.08645814 0.14518814 1.00148148 -0.97272244 0.99092095 0.81014408 0.94429769 0.99373094 0.80242316 -0.03473277 0.06978288 0.21637841 0.00570827 0.11985481] [-0.78343778 -0.82379955 -0.91396383 -0.7809903 -0.16380288 -0.16192652 -0.97272244 1.00148148 -0.95083517 -0.78012071 -0.93138543 -0.95456642 -0.77003389 0.10939953 -0.04469593 -0.19075247 -0.11642453 -0.22217584] [ 0.81335796 0.84071063 0.89409918 0.70097597 0.06359422 0.13824765 0.99092095 -0.95083517 1.00148148 0.81007235 0.9302114 0.98893917 0.7978503 -0.02473253 0.08070738 0.21883232 -0.01722049 0.10042231] [ 0.67885892 0.96173383 0.77803025 0.56768292 0.11683986 0.28198859 0.81014408 -0.78012071 0.81007235 1.00148148 0.7447106 0.79470382 0.86717392 0.02912291 0.13100008 0.0037179 -0.09937783 0.08046519] [ 0.75201699 0.79377977 0.85775358 0.79376668 0.27861769 0.32373762 0.94429769 -0.93138543 0.9302114 0.7447106 1.00148148 0.94154857 0.77989112 0.12826151 0.0311848 0.19396764 0.01052155 0.08340574] [ 0.81156943 0.83368356 0.88577319 0.70900597 0.07229896 0.12309283 0.99373094 -0.95456642 0.98893917 0.79470382 0.94154857 1.00148148 0.79769978 -0.02015901 0.07366951 0.20354531 0.00429328 0.10087841] [ 0.58192296 0.92369871 0.7081183 0.5329057 0.11449509 0.1694013 0.80242316 -0.77003389 0.7978503 0.86717392 0.77989112 0.79769978 1.00148148 0.18488853 0.17095929 -0.05958064 -0.22672195 -0.12092459] [-0.25159836 0.039593 -0.23534514 -0.16148913 0.18856533 0.32646364 -0.03473277 0.10939953 -0.02473253 0.02912291 0.12826151 -0.02015901 0.18488853 1.00148148 -0.08214592 -0.1372034 -0.73498586 -0.78839684] [ 0.229138 0.1429656 0.11472585 0.04224024 -0.05938367 0.0037418 0.06978288 -0.04469593 0.08070738 0.13100008 0.0311848 0.07366951 0.17095929 -0.08214592 1.00148148 -0.05983731 0.11155639 0.08850425] [ 0.15275846 -0.00771439 0.27280808 0.17440558 -0.03736686 0.03499535 0.21637841 -0.19075247 0.21883232 0.0037179 0.19396764 0.20354531 -0.05958064 -0.1372034 -0.05983731 1.00148148 0.08556189 0.21122894] [ 0.29866555 -0.09878982 0.1599696 0.38355712 0.23347194 -0.01497029 0.00570827 -0.11642453 -0.01722049 -0.09937783 0.01052155 0.00429328 -0.22672195 -0.73498586 0.11155639 0.08556189 1.00148148 0.89373486] [ 0.36744677 0.05325642 0.34671282 0.47782313 0.26615263 0.14668507 0.11985481 -0.22217584 0.10042231 0.08046519 0.08340574 0.10087841 -0.12092459 -0.78839684 0.08850425 0.21122894 0.89373486 1.00148148]] Eigenvectors [[ 2.74887804e-01 -1.27067267e-01 -1.13153095e-01 8.43989214e-02 6.81815188e-02 -1.66809334e-01 -4.84757081e-01 -5.71858763e-01 -4.34343247e-01 -2.55711182e-01 4.54788822e-02 -2.02573693e-02 1.27744588e-02 -1.60764021e-01 7.03912028e-03 -7.05946627e-02 7.31532295e-02 -3.10498985e-02] [ 2.93598362e-01 1.22993621e-01 -2.64570883e-02 1.79769453e-01 -9.55342399e-02 3.29925173e-01 2.13804816e-01 -1.94385826e-01 8.98200406e-03 1.15512255e-01 9.19333295e-03 2.14358732e-01 1.18102496e-02 -9.70704179e-02 -5.82153709e-02 4.86068118e-01 1.23057480e-01 -5.83408349e-01] [ 3.05710784e-01 -7.95036214e-02 -5.42799155e-02 -6.52796581e-02 4.60475719e-02 1.32013719e-01 -6.05207386e-02 4.20042179e-01 -1.77238673e-01 -2.35459651e-01 6.98670035e-01 5.19904821e-02 -1.41424922e-02 2.41314730e-01 3.46313305e-02 -1.25135853e-01 -1.12269556e-01 -1.66652208e-01] [ 2.66494751e-01 -1.87492236e-01 2.80987180e-01 -3.62176016e-02 -4.15516188e-02 -2.32826110e-01 2.03503945e-01 6.68464303e-02 -2.56069187e-01 -2.86934441e-02 -8.56938308e-02 1.67318885e-01 4.73846993e-02 -5.07291262e-02 3.59045399e-02 4.68682766e-01 -5.21607921e-01 3.34910185e-01] [ 7.63695461e-02 -1.13499132e-01 6.35258685e-01 3.85963678e-02 -3.89520235e-02 -1.89956398e-01 4.08050361e-01 -1.16101923e-01 -2.81765665e-01 1.08085283e-01 3.65638883e-02 -1.04654579e-01 -3.53293207e-02 9.22681227e-02 -9.54415194e-03 -3.46483012e-01 3.07903344e-01 -1.74887085e-01] [ 9.14080064e-02 1.05931056e-02 5.91630587e-01 2.65059332e-02 1.91874409e-01 3.83334115e-01 -5.14787649e-01 2.14297404e-01 1.37132583e-01 -1.11804725e-01 -2.82613846e-01 1.28594398e-01 1.15714518e-02 -5.76496915e-02 2.82831948e-02 -5.84601530e-02 -6.77272813e-02 -7.83445423e-02] [ 3.17432535e-01 4.87960108e-02 -9.72733341e-02 -9.52818938e-02 -9.09269747e-03 -1.26282135e-01 -5.07047111e-02 1.07921512e-01 4.10332068e-02 1.65888561e-01 -1.70996939e-01 -1.01154495e-01 -8.47067937e-01 -2.79317201e-02 2.28832292e-01 -5.10789633e-02 -3.34035645e-02 -7.07011122e-02] [-3.14702841e-01 1.25504389e-02 5.85105702e-02 8.29960833e-02 7.46152979e-02 1.39418196e-01 -4.29347655e-02 -2.26917503e-01 -1.48736570e-01 -1.69528727e-01 -6.62081436e-02 -1.04806072e-02 -2.42313671e-01 7.91164112e-01 3.46533932e-02 2.64313089e-01 -1.71008207e-03 3.27913374e-02] [ 3.14288641e-01 5.95390856e-02 -1.09355924e-01 -9.20996470e-02 9.73639586e-03 -1.14925148e-01 -8.65025750e-02 7.29634398e-02 -1.31071614e-02 1.87845251e-01 -2.61662912e-01 -2.15709328e-01 1.15004709e-01 3.06611198e-01 -7.16622954e-01 -1.31058879e-01 -2.00294064e-01 -1.50309675e-01] [ 2.83602417e-01 1.14355213e-01 -2.25222478e-02 1.77954567e-01 -7.86995866e-02 4.79335611e-01 4.24117361e-02 -2.55837707e-01 -2.31056717e-02 4.54071195e-01 1.46924172e-01 1.04767160e-01 1.54151859e-02 1.13428590e-01 3.86462955e-02 -2.36753405e-01 -4.14374125e-02 5.09377756e-01] [ 3.08902344e-01 6.74315567e-02 7.18074725e-02 -1.16869544e-01 1.09806512e-02 -2.48438454e-01 -8.47377487e-02 7.36333737e-02 2.87596443e-01 -1.26145244e-01 7.16268655e-02 2.38567962e-01 -4.12711670e-04 1.30265364e-01 -1.77537589e-01 2.64620808e-01 6.35722512e-01 3.51350810e-01] [ 3.13958372e-01 5.58781407e-02 -1.07366649e-01 -9.05284271e-02 -1.08869593e-02 -1.73767958e-01 -6.93872781e-02 5.47468414e-02 7.33212159e-02 1.53214844e-01 -2.44411641e-01 -1.44550139e-01 4.51347824e-01 3.35309426e-01 6.26304331e-01 -2.74906384e-02 3.05158450e-02 -1.44517201e-01] [ 2.72633368e-01 2.12452994e-01 -3.43989493e-02 1.95977639e-01 -6.66772210e-02 1.53294813e-01 3.59780335e-01 -1.08149411e-01 2.33416486e-01 -6.91358276e-01 -1.91679069e-01 -1.91473512e-01 -1.14854568e-02 -1.11571660e-02 1.76816260e-02 -1.97798841e-01 -1.15988288e-01 1.13365415e-01] [-2.19105757e-02 4.84248574e-01 2.97674980e-01 -6.26904893e-02 1.52631977e-01 -2.39078478e-01 -1.12357437e-01 -3.09431745e-01 3.46361322e-01 1.07538057e-01 4.20637059e-01 -2.96818349e-01 -6.82088498e-03 -2.75070733e-02 1.60604884e-02 1.39828121e-01 -2.52961938e-01 -4.98548049e-02] [ 3.98587254e-02 -4.87290824e-02 -1.07633480e-01 6.20639359e-01 7.26120428e-01 -1.66971301e-01 1.00100563e-01 1.44706891e-01 7.91669469e-03 1.04601065e-01 -1.27666494e-02 2.35930261e-03 3.47560940e-03 -1.40084255e-02 -1.27082080e-03 1.33655809e-02 3.27477087e-02 2.31190041e-02] [ 5.91752319e-02 -1.31170166e-01 -7.19761018e-02 -6.58191724e-01 6.05097597e-01 2.22505207e-01 2.51952658e-01 -2.25018279e-01 2.35336275e-02 -3.40454641e-02 -3.91275050e-02 7.12342751e-02 1.35761595e-02 -3.70797145e-02 8.24785653e-03 -3.34884593e-02 -3.45720415e-03 -7.20102115e-03] [ 3.12279837e-02 -5.42112290e-01 1.67825659e-02 1.08359821e-01 -9.66305813e-02 -1.67345364e-01 -1.16935224e-02 -2.71168190e-01 5.49713377e-01 1.39548674e-02 9.04611778e-02 3.59541075e-01 -4.53314926e-02 1.49150209e-01 -6.08753447e-03 -2.07684384e-01 -2.22511801e-01 -1.59028516e-01] [ 7.54738322e-02 -5.42521204e-01 4.62559903e-02 4.83175141e-02 -3.73237567e-02 2.28774256e-01 -3.62775036e-02 8.39594685e-04 1.61964985e-01 4.32736230e-02 9.51719663e-02 -7.00486388e-01 4.62921363e-03 -6.62744179e-02 -1.09939345e-02 2.83505571e-01 1.39169026e-01 9.12720786e-02]] Eigenvalues [9.37171511e+00 2.99299982e+00 1.99085013e+00 1.20036900e+00 8.91779285e-01 5.28301471e-01 3.55456828e-01 2.16992418e-01 1.62052565e-01 8.75863441e-02 6.47609771e-02 4.64322045e-02 3.49560007e-03 3.52929241e-02 1.07622023e-02 2.84079329e-02 1.76254890e-02 2.17863662e-02]
tot = sum(e_vals)
var_exp = [( i /tot ) * 100 for i in sorted(e_vals, reverse=True)]
cum_var_exp = np.cumsum(var_exp)
print("Cumulative Variance Explained", cum_var_exp)
Cumulative Variance Explained [ 51.98806456 68.59124406 79.63516122 86.2940129 91.24101338 94.17167982 96.14351873 97.34724886 98.2462091 98.73208009 99.09133107 99.34890617 99.54468792 99.70227631 99.82313263 99.92090716 99.98060873 100. ]
print('Eigen Values are: ')
print(pca.explained_variance_)
Eigen Values are: [9.37171511 2.99299982 1.99085013 1.200369 0.89177929 0.52830147 0.35545683 0.21699242 0.16205257 0.08758634]
print('Eigen Vectors are: ')
print(pca.components_)
Eigen Vectors are: [[ 0.2748878 0.29359836 0.30571078 0.26649475 0.07636955 0.09140801 0.31743253 -0.31470284 0.31428864 0.28360242 0.30890234 0.31395837 0.27263337 -0.02191058 0.03985873 0.05917523 0.03122798 0.07547383] [-0.12706727 0.12299362 -0.07950362 -0.18749224 -0.11349913 0.01059311 0.04879601 0.01255044 0.05953909 0.11435521 0.06743156 0.05587814 0.21245299 0.48424857 -0.04872908 -0.13117017 -0.54211229 -0.5425212 ] [-0.1131531 -0.02645709 -0.05427992 0.28098718 0.63525869 0.59163059 -0.09727333 0.05851057 -0.10935592 -0.02252225 0.07180747 -0.10736665 -0.03439895 0.29767498 -0.10763348 -0.0719761 0.01678257 0.04625599] [ 0.08439892 0.17976945 -0.06527966 -0.0362176 0.03859637 0.02650593 -0.09528189 0.08299608 -0.09209965 0.17795457 -0.11686954 -0.09052843 0.19597764 -0.06269049 0.62063936 -0.65819172 0.10835982 0.04831751] [ 0.06818152 -0.09553424 0.04604757 -0.04155162 -0.03895202 0.19187441 -0.0090927 0.0746153 0.0097364 -0.07869959 0.01098065 -0.01088696 -0.06667722 0.15263198 0.72612043 0.6050976 -0.09663058 -0.03732376] [ 0.16680933 -0.32992517 -0.13201372 0.23282611 0.1899564 -0.38333412 0.12628214 -0.1394182 0.11492515 -0.47933561 0.24843845 0.17376796 -0.15329481 0.23907848 0.1669713 -0.22250521 0.16734536 -0.22877426] [ 0.48475708 -0.21380482 0.06052074 -0.20350394 -0.40805036 0.51478765 0.05070471 0.04293477 0.08650258 -0.04241174 0.08473775 0.06938728 -0.35978034 0.11235744 -0.10010056 -0.25195266 0.01169352 0.0362775 ] [ 0.57185876 0.19438583 -0.42004218 -0.06684643 0.11610192 -0.2142974 -0.10792151 0.2269175 -0.07296344 0.25583771 -0.07363337 -0.05474684 0.10814941 0.30943174 -0.14470689 0.22501828 0.27116819 -0.00083959] [-0.43434325 0.008982 -0.17723867 -0.25606919 -0.28176567 0.13713258 0.04103321 -0.14873657 -0.01310716 -0.02310567 0.28759644 0.07332122 0.23341649 0.34636132 0.00791669 0.02353363 0.54971338 0.16196498] [-0.25571118 0.11551225 -0.23545965 -0.02869344 0.10808528 -0.11180473 0.16588856 -0.16952873 0.18784525 0.45407119 -0.12614524 0.15321484 -0.69135828 0.10753806 0.10460107 -0.03404546 0.01395487 0.04327362]]
print('The percentage of variation explained by each eigen Vector: ')
print(pca.explained_variance_ratio_)
The percentage of variation explained by each eigen Vector: [0.51988065 0.1660318 0.11043917 0.06658852 0.04947 0.02930666 0.01971839 0.0120373 0.0089896 0.00485871]
# Create new dataset with the trained PCA model
pca_data = pca.transform(normalized_x_train)
pca_data
array([[ 2.68537705e+00, 6.41161453e-01, 1.29124986e+01, ...,
-1.94939525e-02, -1.37447811e+00, 3.45141381e-01],
[-1.91473510e+00, 2.16057348e+00, 2.84060546e-02, ...,
-2.91902572e-02, 3.77666299e-01, 8.65318988e-02],
[ 3.69107270e+00, 2.42832137e+00, 1.49607509e+01, ...,
1.21706857e+00, 1.13547424e+00, 8.06770428e-02],
...,
[ 4.45594431e-02, -1.53445229e+00, -6.40693942e-02, ...,
-7.26459129e-01, -6.06129468e-02, -3.74090287e-02],
[-5.94390302e-01, 1.66176658e+00, -1.66653025e-01, ...,
4.05896761e-01, 1.06854042e-02, 3.03615712e-01],
[ 2.84326630e+00, 2.18778344e+00, 1.32909814e+01, ...,
7.33592301e-01, 5.08093260e-01, -5.56033007e-01]])
sns.pairplot(pd.DataFrame(pca_data), diag_kind = 'kde', corner = True);
C:\Users\fenuj\anaconda3\Lib\site-packages\seaborn\axisgrid.py:118: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
plt.figure(figsize=(10 , 5))
# plt.bar(range(1, e_vals.size + 1), var_exp, alpha = 0.5, align = 'center', label = 'Individual explained variance')
plt.step(range(1, e_vals.size + 1), cum_var_exp, where='mid', label = 'Cumulative explained variance')
plt.ylabel('Explained Variance Ratio')
plt.xlabel('Principal Components')
plt.legend(loc = 'best')
plt.tight_layout()
plt.show()
plt.figure(figsize=(10 , 5))
plt.step(range(1, e_vals.size + 1), cum_var_exp, where='mid', label = 'Cumulative explained variance')
plt.ylabel('Explained Variance Ratio')
plt.xlabel('Principal Components')
plt.legend(loc = 'best')
# Plot horizontal line
# matplotlib.pyplot.axhline(y, xmin, xmax, color, label)
plt.axhline(90, 0, 20, color='red', linestyle='dotted')
plt.tight_layout()
plt.show()
# Create PCA model with 5 components as 90% or more variance is explained by this
pca1 = PCA(n_components=5)
# Fit the training data
pca1.fit(normalized_x_train)
PCA(n_components=5)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA(n_components=5)
# Check out the new components
df_comp1 = pd.DataFrame(pca1.components_,columns=list(normalized_x_train))
df_comp1.head()
| compactness | circularity | distance_circularity | radius_ratio | pr.axis_aspect_ratio | max.length_aspect_ratio | scatter_ratio | elongatedness | pr.axis_rectangularity | max.length_rectangularity | scaled_variance | scaled_variance.1 | scaled_radius_of_gyration | scaled_radius_of_gyration.1 | skewness_about | skewness_about.1 | skewness_about.2 | hollows_ratio | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.274888 | 0.293598 | 0.305711 | 0.266495 | 0.076370 | 0.091408 | 0.317433 | -0.314703 | 0.314289 | 0.283602 | 0.308902 | 0.313958 | 0.272633 | -0.021911 | 0.039859 | 0.059175 | 0.031228 | 0.075474 |
| 1 | -0.127067 | 0.122994 | -0.079504 | -0.187492 | -0.113499 | 0.010593 | 0.048796 | 0.012550 | 0.059539 | 0.114355 | 0.067432 | 0.055878 | 0.212453 | 0.484249 | -0.048729 | -0.131170 | -0.542112 | -0.542521 |
| 2 | -0.113153 | -0.026457 | -0.054280 | 0.280987 | 0.635259 | 0.591631 | -0.097273 | 0.058511 | -0.109356 | -0.022522 | 0.071807 | -0.107367 | -0.034399 | 0.297675 | -0.107633 | -0.071976 | 0.016783 | 0.046256 |
| 3 | 0.084399 | 0.179769 | -0.065280 | -0.036218 | 0.038596 | 0.026506 | -0.095282 | 0.082996 | -0.092100 | 0.177955 | -0.116870 | -0.090528 | 0.195978 | -0.062690 | 0.620639 | -0.658192 | 0.108360 | 0.048318 |
| 4 | 0.068182 | -0.095534 | 0.046048 | -0.041552 | -0.038952 | 0.191874 | -0.009093 | 0.074615 | 0.009736 | -0.078700 | 0.010981 | -0.010887 | -0.066677 | 0.152632 | 0.726120 | 0.605098 | -0.096631 | -0.037324 |
# Plot cumulative variances vs eigen values
plt.step(list(range(1,6)),np.cumsum(pca1.explained_variance_ratio_), where='mid')
plt.ylabel('Cum of variation explained')
plt.xlabel('eigen Value')
plt.show()
# Create new dataset with the trained PCA model
pca1_data = pca1.transform(normalized_x_train)
pca1_data
array([[ 2.68537705, 0.64116145, 12.91249858, -0.64761694, 0.64822572],
[-1.9147351 , 2.16057348, 0.02840605, -1.00360621, -0.44545286],
[ 3.6910727 , 2.42832137, 14.96075092, 0.77651727, 2.89508444],
...,
[ 0.04455944, -1.53445229, -0.06406939, -1.69880018, 0.15417026],
[-0.5943903 , 1.66176658, -0.16665303, -0.38560996, -0.53003688],
[ 2.8432663 , 2.18778344, 13.29098139, -0.88750177, 1.22038393]])
# Plot the resulting pairplot from reduced data
sns.pairplot(pd.DataFrame(pca1_data), diag_kind = 'kde', corner = True);
C:\Users\fenuj\anaconda3\Lib\site-packages\seaborn\axisgrid.py:118: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
# Building a Support Vector Machine on data from above step
svc2 = SVC(random_state = 111)
# Fit the training data
svc2.fit(pca1_data, y_train)
# check the accuracy on the training set
print("Training accuracy is: ",svc2.score(pca1_data, y_train))
Training accuracy is: 0.7973372781065089
svc2_y_pred_train = svc2.predict(pca1_data)
# Create list of classes in the target variable class
target_names = ['car','bus','van']
# Print Classification Report for training data
print(classification_report(y_train, svc2_y_pred_train, target_names=target_names))
precision recall f1-score support
car 0.82 0.70 0.75 181
bus 0.85 0.87 0.86 339
van 0.67 0.76 0.71 156
accuracy 0.80 676
macro avg 0.78 0.77 0.78 676
weighted avg 0.80 0.80 0.80 676
print("Confusion Matrix for Training Data with PCA")
# Calculate & Create the confusion matrix
cm=metrics.confusion_matrix(y_train, svc2_y_pred_train, labels=['car','bus','van'])
df_cm = pd.DataFrame(cm, index = [i for i in ['car','bus','van']],
columns = [i for i in ['car','bus','van']])
# Import plotting library
# import matplotlib.pyplot as plt
# %matplotlib inline
# Plot the confusion matrix
plt.figure(figsize = (7,5));
sns.heatmap(df_cm, annot=True, fmt = 'g');
Confusion Matrix for Training Data with PCA
# Manually tune the hyperparameters
clf = SVC(gamma=0.1, C=5, random_state=111, kernel='rbf', class_weight='balanced', verbose=3)
# Fit the training data
clf.fit(pca1_data, y_train)
clf_y_pred_train = clf.predict(pca1_data)
# Print Classification Report for training data
print(classification_report(y_train, clf_y_pred_train, target_names=target_names))
from sklearn.metrics import accuracy_score, confusion_matrix
# Confusion Matrix of training data
print("Confusion Matrix for Training Data with PCA")
# Calculate & Create the confusion matrix
cm=metrics.confusion_matrix(y_train, clf_y_pred_train, labels=['car','bus','van'])
df_cm = pd.DataFrame(cm, index = [i for i in ['car','bus','van']],
columns = [i for i in ['car','bus','van']])
# Plot the confusion matrix
plt.figure(figsize = (7,5));
sns.heatmap(df_cm, annot=True, fmt = 'g');
[LibSVM] precision recall f1-score support
car 0.83 0.86 0.85 181
bus 0.96 0.83 0.89 339
van 0.73 0.91 0.81 156
accuracy 0.86 676
macro avg 0.84 0.87 0.85 676
weighted avg 0.87 0.86 0.86 676
Confusion Matrix for Training Data with PCA
param_grid = {'C': [0.1, 1, 10, 100, 1000], 'gamma': [1, 0.1, 0.01, 0.001, 0.0001], 'kernel': ['rbf'], }
grid = GridSearchCV(SVC(), param_grid, refit = True, verbose = 3)
grid.fit(pca1_data, y_train)
# print best parameter after tuning
print(grid.best_params_)
# print how our model looks after hyper-parameter tuning
print(grid.best_estimator_)
{'C': 10, 'gamma': 0.1, 'kernel': 'rbf'}
SVC(C=10, gamma=0.1)
grid_predictions = grid.predict(pca1_data)
# print classification report
print(classification_report(y_train, grid_predictions))
precision recall f1-score support
bus 0.89 0.86 0.87 181
car 0.91 0.94 0.92 339
van 0.84 0.82 0.83 156
accuracy 0.89 676
macro avg 0.88 0.87 0.87 676
weighted avg 0.89 0.89 0.89 676
print("Confusion Matrix for Training Data with PCA and GridSearchCV")
# Calculate & Create the confusion matrix
cm=metrics.confusion_matrix(y_train, grid_predictions, labels=['car','bus','van'])
df_cm = pd.DataFrame(cm, index = [i for i in ['car','bus','van']],
columns = [i for i in ['car','bus','van']])
# Plot the confusion matrix
plt.figure(figsize = (7,5));
sns.heatmap(df_cm, annot=True, fmt = 'g');
Confusion Matrix for Training Data with PCA and GridSearchCV